{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "30fce67b-9382-4a4e-9ba1-26fef6ee35d4",
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import json\n",
    "import os\n",
    "import random\n",
    "from random import shuffle\n",
    "\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "67b0501f-be44-449b-8495-f79bceff53ac",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input locations, given relative to the working directory.\n",
    "# data_dir: scanned for per-task \"<task>_test.csv\" files (the questions to evaluate).\n",
    "data_dir = \"./test/\"\n",
    "# cot_dir: holds the matching \"<task>_val.csv\" files used to build the few-shot prompt.\n",
    "cot_dir = \"./val/\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "7f227584-b3cd-460f-ac8e-3e3077a8390b",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 57/57 [00:00<00:00, 522.23it/s]\n"
     ]
    }
   ],
   "source": [
    "TEST_SUFFIX = \"_test.csv\"  # only entries with this suffix are treated as task files\n",
    "NUM_SHOTS = 5  # few-shot examples prepended to every test question\n",
    "SEED = 42  # fixes which validation examples are drawn, so reruns match\n",
    "QUESTION_TEMPLATE = \"Q: {}\\nOptions:\\n(A): {}\\n(B): {}\\n(C): {}\\n(D): {}\"\n",
    "\n",
    "\n",
    "def read_rows(csv_path):\n",
    "    \"\"\"Read a task CSV and return its rows as lists of strings.\n",
    "\n",
    "    Every non-blank row is treated as data (no header row is skipped) and must\n",
    "    provide at least six columns: question, options A-D, answer letter.\n",
    "\n",
    "    Raises:\n",
    "        ValueError: if a non-blank row has fewer than six columns.\n",
    "    \"\"\"\n",
    "    rows = []\n",
    "    # newline=\"\" follows the csv module docs so that line breaks inside quoted\n",
    "    # fields reach the parser untouched.\n",
    "    with open(csv_path, \"r\", encoding=\"utf-8\", newline=\"\") as fr:\n",
    "        csv_reader = csv.reader(fr)\n",
    "        for row in csv_reader:\n",
    "            if not row:  # csv.reader yields [] for a blank line\n",
    "                continue\n",
    "            if len(row) < 6:\n",
    "                raise ValueError(\n",
    "                    \"{}: line {} has {} columns, expected at least 6\".format(\n",
    "                        csv_path, csv_reader.line_num, len(row)\n",
    "                    )\n",
    "                )\n",
    "            rows.append(row)\n",
    "    return rows\n",
    "\n",
    "\n",
    "# A dedicated, seeded generator makes this cell give the same result on every run.\n",
    "rng = random.Random(SEED)\n",
    "# os.listdir returns entries in arbitrary order; sorting keeps the dataset order\n",
    "# (and therefore the sequence of shuffles) stable between runs.\n",
    "file_list = sorted(name for name in os.listdir(data_dir) if name.endswith(TEST_SUFFIX))\n",
    "all_data = list()\n",
    "for file_name in tqdm(file_list):\n",
    "    # Strip the suffix only at the end of the name to get the task name.\n",
    "    task_name = file_name[: -len(TEST_SUFFIX)]\n",
    "    task_data = [\n",
    "        {\n",
    "            \"instruction\": QUESTION_TEMPLATE.format(*row[:5]),\n",
    "            \"answer\": \"({})\".format(row[5]),\n",
    "        }\n",
    "        for row in read_rows(os.path.join(data_dir, file_name))\n",
    "    ]\n",
    "\n",
    "    # Few-shot block: up to NUM_SHOTS answered questions from the task's validation file.\n",
    "    cot_examples = [\n",
    "        (QUESTION_TEMPLATE + \"\\nA: The answer is ({}).\").format(*row[:6])\n",
    "        for row in read_rows(os.path.join(cot_dir, \"{}_val.csv\".format(task_name)))\n",
    "    ]\n",
    "    rng.shuffle(cot_examples)\n",
    "    cot_prompt = \"\\n\\n\".join(cot_examples[:NUM_SHOTS])\n",
    "\n",
    "    for ei, example in enumerate(task_data):\n",
    "        all_data.append({\n",
    "            \"task_name\": task_name,\n",
    "            \"idx\": ei + 1,  # 1-based position of the question within its task\n",
    "            \"instruction\": \"{}\\n\\n{}\\nA:\".format(cot_prompt, example[\"instruction\"]),\n",
    "            \"answer\": [example[\"answer\"]],\n",
    "        })"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "24e3b4ca-b3dd-485f-837c-d9378c9846a1",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Q: What is soft power?\n",
      "Options:\n",
      "(A): Use of coercive measures short of war, such as economic sanctions\n",
      "(B): Ways in which the values and culture of a country provide it with influence in the international system\n",
      "(C): Ineffective use of one's power\n",
      "(D): When states use carrots (incentives) instead of sticks (threats) to influence other countries.\n",
      "A: The answer is (B).\n",
      "\n",
      "Q: Which factor is cited by Revisionists as the primary cause of the Cold War?\n",
      "Options:\n",
      "(A): The threat posed by the Soviet Union\n",
      "(B): Domestic concerns of the US\n",
      "(C): Soviet ideology\n",
      "(D): None of the above\n",
      "A: The answer is (B).\n",
      "\n",
      "Q: According to Realists, what accounts for the onset of the Cold War?\n",
      "Options:\n",
      "(A): Ideological differences\n",
      "(B): A power vacuum\n",
      "(C): The expansionist nature of the Soviet Union\n",
      "(D): Both b and c\n",
      "A: The answer is (D).\n",
      "\n",
      "Q: Who is considered the “father” of containment?\n",
      "Options:\n",
      "(A): George Kennan\n",
      "(B): John Foster Dulles\n",
      "(C): Henry Kissinger\n",
      "(D): Dwight Eisenhower\n",
      "A: The answer is (A).\n",
      "\n",
      "Q: Who were the original five permanent members on the UN Security Council?\n",
      "Options:\n",
      "(A): United States, Soviet Union, Germany, France, and Great Britain\n",
      "(B): United States, Germany, France, Great Britain, and Japan\n",
      "(C): United States, Great Britain, Republic of China, India, and Brazil\n",
      "(D): United States, Soviet Union, France, Great Britain, and Republic of China\n",
      "A: The answer is (D).\n",
      "\n",
      "Q: What were the 'open-door notes'?\n",
      "Options:\n",
      "(A): An American declaration that the US was always open to Chinese immigration\n",
      "(B): An American proclamation that China should be divided up between the US, Japan and the European empires\n",
      "(C): An American proclamation that China should be open to US trade and missionaries\n",
      "(D): An American declaration of support for Chinese economic protectionism\n",
      "A:\n",
      "['(C)']\n"
     ]
    }
   ],
   "source": [
    "# Spot-check one assembled example: its full prompt first, then its answer list.\n",
    "sample = all_data[1400]\n",
    "for field in (\"instruction\", \"answer\"):\n",
    "    print(sample[field])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "ba980807-841b-4825-8d24-060d7b432a3f",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 14042/14042 [00:00<00:00, 61485.38it/s]\n"
     ]
    }
   ],
   "source": [
    "# Persist the dataset in JSON Lines form: one serialized example per line.\n",
    "with open(\"mmlu_dataset.json\", \"w\", encoding=\"utf-8\") as out_file:\n",
    "    for record in tqdm(all_data):\n",
    "        out_file.write(\"{}\\n\".format(json.dumps(record)))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.5"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
